In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [2]:
df=pd.read_csv('marketing_campaign.csv')
print(df)
         ID  Year_Birth   Education Marital_Status   Income  Kidhome  \
0      5524        1957  Graduation         Single  58138.0        0   
1      2174        1954  Graduation         Single  46344.0        1   
2      4141        1965  Graduation       Together  71613.0        0   
3      6182        1984  Graduation       Together  26646.0        1   
4      5324        1981         PhD        Married  58293.0        1   
...     ...         ...         ...            ...      ...      ...   
2235  10870        1967  Graduation        Married  61223.0        0   
2236   4001        1946         PhD       Together  64014.0        2   
2237   7270        1981  Graduation       Divorced  56981.0        0   
2238   8235        1956      Master       Together  69245.0        0   
2239   9405        1954         PhD        Married  52869.0        1   

      Teenhome Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  \
0            0    4/9/2012       58       635  ...                  7   
1            1    8/3/2014       38        11  ...                  5   
2            0  21-08-2013       26       426  ...                  4   
3            0   10/2/2014       26        11  ...                  6   
4            0  19-01-2014       94       173  ...                  5   
...        ...         ...      ...       ...  ...                ...   
2235         1  13-06-2013       46       709  ...                  5   
2236         1   10/6/2014       56       406  ...                  7   
2237         0  25-01-2014       91       908  ...                  6   
2238         1  24-01-2014        8       428  ...                  3   
2239         1  15-10-2012       40        84  ...                  7   

      AcceptedCmp3  AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  \
0                0             0             0             0             0   
1                0             0             0             0             0   
2                0             0             0             0             0   
3                0             0             0             0             0   
4                0             0             0             0             0   
...            ...           ...           ...           ...           ...   
2235             0             0             0             0             0   
2236             0             0             0             1             0   
2237             0             1             0             0             0   
2238             0             0             0             0             0   
2239             0             0             0             0             0   

      Complain  Z_CostContact  Z_Revenue  Response  
0            0              3         11         1  
1            0              3         11         0  
2            0              3         11         0  
3            0              3         11         0  
4            0              3         11         0  
...        ...            ...        ...       ...  
2235         0              3         11         0  
2236         0              3         11         0  
2237         0              3         11         0  
2238         0              3         11         0  
2239         0              3         11         1  

[2240 rows x 29 columns]
In [3]:
df.head()
Out[3]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 4/9/2012 58 635 ... 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 8/3/2014 38 11 ... 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 10/2/2014 26 11 ... 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 5 0 0 0 0 0 0 3 11 0

5 rows × 29 columns

In [4]:
df.tail()
Out[4]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
2235 10870 1967 Graduation Married 61223.0 0 1 13-06-2013 46 709 ... 5 0 0 0 0 0 0 3 11 0
2236 4001 1946 PhD Together 64014.0 2 1 10/6/2014 56 406 ... 7 0 0 0 1 0 0 3 11 0
2237 7270 1981 Graduation Divorced 56981.0 0 0 25-01-2014 91 908 ... 6 0 1 0 0 0 0 3 11 0
2238 8235 1956 Master Together 69245.0 0 1 24-01-2014 8 428 ... 3 0 0 0 0 0 0 3 11 0
2239 9405 1954 PhD Married 52869.0 1 1 15-10-2012 40 84 ... 7 0 0 0 0 0 0 3 11 1

5 rows × 29 columns

In [5]:
df.shape
Out[5]:
(2240, 29)
In [6]:
df.isnull().sum()
Out[6]:
ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64
In [7]:
df.dropna(inplace=True)
In [8]:
df.isnull().sum()
Out[8]:
ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64
In [9]:
df
Out[9]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 4/9/2012 58 635 ... 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 8/3/2014 38 11 ... 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 10/2/2014 26 11 ... 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 5 0 0 0 0 0 0 3 11 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 13-06-2013 46 709 ... 5 0 0 0 0 0 0 3 11 0
2236 4001 1946 PhD Together 64014.0 2 1 10/6/2014 56 406 ... 7 0 0 0 1 0 0 3 11 0
2237 7270 1981 Graduation Divorced 56981.0 0 0 25-01-2014 91 908 ... 6 0 1 0 0 0 0 3 11 0
2238 8235 1956 Master Together 69245.0 0 1 24-01-2014 8 428 ... 3 0 0 0 0 0 0 3 11 0
2239 9405 1954 PhD Married 52869.0 1 1 15-10-2012 40 84 ... 7 0 0 0 0 0 0 3 11 1

2216 rows × 29 columns

In [10]:
print(df.duplicated().sum())
0
In [11]:
df.columns
Out[11]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')
In [12]:
df.columns=df.columns.str.strip()
df
Out[12]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 4/9/2012 58 635 ... 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 8/3/2014 38 11 ... 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 10/2/2014 26 11 ... 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 5 0 0 0 0 0 0 3 11 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 13-06-2013 46 709 ... 5 0 0 0 0 0 0 3 11 0
2236 4001 1946 PhD Together 64014.0 2 1 10/6/2014 56 406 ... 7 0 0 0 1 0 0 3 11 0
2237 7270 1981 Graduation Divorced 56981.0 0 0 25-01-2014 91 908 ... 6 0 1 0 0 0 0 3 11 0
2238 8235 1956 Master Together 69245.0 0 1 24-01-2014 8 428 ... 3 0 0 0 0 0 0 3 11 0
2239 9405 1954 PhD Married 52869.0 1 1 15-10-2012 40 84 ... 7 0 0 0 0 0 0 3 11 1

2216 rows × 29 columns

In [13]:
df.dtypes
Out[13]:
ID                       int64
Year_Birth               int64
Education               object
Marital_Status          object
Income                 float64
Kidhome                  int64
Teenhome                 int64
Dt_Customer             object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Z_CostContact            int64
Z_Revenue                int64
Response                 int64
dtype: object
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2216 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2216 non-null   int64  
 1   Year_Birth           2216 non-null   int64  
 2   Education            2216 non-null   object 
 3   Marital_Status       2216 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2216 non-null   int64  
 6   Teenhome             2216 non-null   int64  
 7   Dt_Customer          2216 non-null   object 
 8   Recency              2216 non-null   int64  
 9   MntWines             2216 non-null   int64  
 10  MntFruits            2216 non-null   int64  
 11  MntMeatProducts      2216 non-null   int64  
 12  MntFishProducts      2216 non-null   int64  
 13  MntSweetProducts     2216 non-null   int64  
 14  MntGoldProds         2216 non-null   int64  
 15  NumDealsPurchases    2216 non-null   int64  
 16  NumWebPurchases      2216 non-null   int64  
 17  NumCatalogPurchases  2216 non-null   int64  
 18  NumStorePurchases    2216 non-null   int64  
 19  NumWebVisitsMonth    2216 non-null   int64  
 20  AcceptedCmp3         2216 non-null   int64  
 21  AcceptedCmp4         2216 non-null   int64  
 22  AcceptedCmp5         2216 non-null   int64  
 23  AcceptedCmp1         2216 non-null   int64  
 24  AcceptedCmp2         2216 non-null   int64  
 25  Complain             2216 non-null   int64  
 26  Z_CostContact        2216 non-null   int64  
 27  Z_Revenue            2216 non-null   int64  
 28  Response             2216 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 519.4+ KB
In [15]:
df.describe()
Out[15]:
ID Year_Birth Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
count 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 ... 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.000000 2216.0 2216.0 2216.000000
mean 5588.353339 1968.820397 52247.251354 0.441787 0.505415 49.012635 305.091606 26.356047 166.995939 37.637635 ... 5.319043 0.073556 0.074007 0.073105 0.064079 0.013538 0.009477 3.0 11.0 0.150271
std 3249.376275 11.985554 25173.076661 0.536896 0.544181 28.948352 337.327920 39.793917 224.283273 54.752082 ... 2.425359 0.261106 0.261842 0.260367 0.244950 0.115588 0.096907 0.0 0.0 0.357417
min 0.000000 1893.000000 1730.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
25% 2814.750000 1959.000000 35303.000000 0.000000 0.000000 24.000000 24.000000 2.000000 16.000000 3.000000 ... 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
50% 5458.500000 1970.000000 51381.500000 0.000000 0.000000 49.000000 174.500000 8.000000 68.000000 12.000000 ... 6.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
75% 8421.750000 1977.000000 68522.000000 1.000000 1.000000 74.000000 505.000000 33.000000 232.250000 50.000000 ... 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.0 11.0 0.000000
max 11191.000000 1996.000000 666666.000000 2.000000 2.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 ... 20.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 3.0 11.0 1.000000

8 rows × 26 columns

In [16]:
df.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)
In [17]:
df
Out[17]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response
0 5524 1957 Graduation Single 58138.0 0 0 4/9/2012 58 635 ... 10 4 7 0 0 0 0 0 0 1
1 2174 1954 Graduation Single 46344.0 1 1 8/3/2014 38 11 ... 1 2 5 0 0 0 0 0 0 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 2 10 4 0 0 0 0 0 0 0
3 6182 1984 Graduation Together 26646.0 1 0 10/2/2014 26 11 ... 0 4 6 0 0 0 0 0 0 0
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 3 6 5 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 13-06-2013 46 709 ... 3 4 5 0 0 0 0 0 0 0
2236 4001 1946 PhD Together 64014.0 2 1 10/6/2014 56 406 ... 2 5 7 0 0 0 1 0 0 0
2237 7270 1981 Graduation Divorced 56981.0 0 0 25-01-2014 91 908 ... 3 13 6 0 1 0 0 0 0 0
2238 8235 1956 Master Together 69245.0 0 1 24-01-2014 8 428 ... 5 10 3 0 0 0 0 0 0 0
2239 9405 1954 PhD Married 52869.0 1 1 15-10-2012 40 84 ... 1 4 7 0 0 0 0 0 0 1

2216 rows × 27 columns

In [18]:
df['Age']=2024-df['Year_Birth']
df
Out[18]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age
0 5524 1957 Graduation Single 58138.0 0 0 4/9/2012 58 635 ... 4 7 0 0 0 0 0 0 1 67
1 2174 1954 Graduation Single 46344.0 1 1 8/3/2014 38 11 ... 2 5 0 0 0 0 0 0 0 70
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 10 4 0 0 0 0 0 0 0 59
3 6182 1984 Graduation Together 26646.0 1 0 10/2/2014 26 11 ... 4 6 0 0 0 0 0 0 0 40
4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173 ... 6 5 0 0 0 0 0 0 0 43
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 13-06-2013 46 709 ... 4 5 0 0 0 0 0 0 0 57
2236 4001 1946 PhD Together 64014.0 2 1 10/6/2014 56 406 ... 5 7 0 0 0 1 0 0 0 78
2237 7270 1981 Graduation Divorced 56981.0 0 0 25-01-2014 91 908 ... 13 6 0 1 0 0 0 0 0 43
2238 8235 1956 Master Together 69245.0 0 1 24-01-2014 8 428 ... 10 3 0 0 0 0 0 0 0 68
2239 9405 1954 PhD Married 52869.0 1 1 15-10-2012 40 84 ... 4 7 0 0 0 0 0 0 1 70

2216 rows × 28 columns

In [19]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='mixed')
In [20]:
df
Out[20]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age
0 5524 1957 Graduation Single 58138.0 0 0 2012-04-09 58 635 ... 4 7 0 0 0 0 0 0 1 67
1 2174 1954 Graduation Single 46344.0 1 1 2014-08-03 38 11 ... 2 5 0 0 0 0 0 0 0 70
2 4141 1965 Graduation Together 71613.0 0 0 2013-08-21 26 426 ... 10 4 0 0 0 0 0 0 0 59
3 6182 1984 Graduation Together 26646.0 1 0 2014-10-02 26 11 ... 4 6 0 0 0 0 0 0 0 40
4 5324 1981 PhD Married 58293.0 1 0 2014-01-19 94 173 ... 6 5 0 0 0 0 0 0 0 43
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 2013-06-13 46 709 ... 4 5 0 0 0 0 0 0 0 57
2236 4001 1946 PhD Together 64014.0 2 1 2014-10-06 56 406 ... 5 7 0 0 0 1 0 0 0 78
2237 7270 1981 Graduation Divorced 56981.0 0 0 2014-01-25 91 908 ... 13 6 0 1 0 0 0 0 0 43
2238 8235 1956 Master Together 69245.0 0 1 2014-01-24 8 428 ... 10 3 0 0 0 0 0 0 0 68
2239 9405 1954 PhD Married 52869.0 1 1 2012-10-15 40 84 ... 4 7 0 0 0 0 0 0 1 70

2216 rows × 28 columns

In [21]:
df.value_counts('Education')
Out[21]:
Education
Graduation    1116
PhD            481
Master         365
2n Cycle       200
Basic           54
Name: count, dtype: int64
In [22]:
plt.figure(figsize=(14,7))
sns.countplot(x='Education',data=df)
plt.xlabel('Education')
plt.show()
No description has been provided for this image
In [23]:
fig = px.bar(df, x = 'Education', y = "Income", color = "Education", title = "Education vs Income")
fig.update_layout(xaxis = {'categoryorder':'total descending'})
fig.show()
In [24]:
df.value_counts('Marital_Status')
Out[24]:
Marital_Status
Married     857
Together    573
Single      471
Divorced    232
Widow        76
Alone         3
Absurd        2
YOLO          2
Name: count, dtype: int64
In [25]:
plt.figure(figsize=(14,7))
sns.countplot(x='Marital_Status',data=df)
plt.xlabel('Marital_Status')
plt.show()
No description has been provided for this image
In [26]:
fig = px.bar(df, x = 'Marital_Status', y = "Income", color = "Marital_Status", title = "Marital_Status vs Income")
fig.update_layout(xaxis = {'categoryorder':'total descending'})
fig.show()
In [27]:
df.value_counts('AcceptedCmp1')
Out[27]:
AcceptedCmp1
0    2074
1     142
Name: count, dtype: int64
In [28]:
df.value_counts('AcceptedCmp2')
Out[28]:
AcceptedCmp2
0    2186
1      30
Name: count, dtype: int64
In [29]:
df.value_counts('AcceptedCmp3')
Out[29]:
AcceptedCmp3
0    2053
1     163
Name: count, dtype: int64
In [30]:
df.value_counts('AcceptedCmp4')
Out[30]:
AcceptedCmp4
0    2052
1     164
Name: count, dtype: int64
In [31]:
df.value_counts('AcceptedCmp5')
Out[31]:
AcceptedCmp5
0    2054
1     162
Name: count, dtype: int64
In [32]:
df.value_counts('Response')
Out[32]:
Response
0    1883
1     333
Name: count, dtype: int64
In [33]:
df.groupby("Marital_Status")["Recency"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Recency by Marital Status')
plt.show()
No description has been provided for this image
In [34]:
plt.figure(figsize=(14,7))
sns.countplot(x='Kidhome',data=df,palette='Set1')
plt.xlabel('Kidhome')
plt.show()
No description has been provided for this image
In [35]:
plt.figure(figsize=(14,7))
sns.countplot(x='Teenhome',data=df, palette="Set2")
plt.xlabel('Teenhome')
plt.show()
No description has been provided for this image
In [36]:
df = df[df['Age'] <= 100]
df
Out[36]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age
0 5524 1957 Graduation Single 58138.0 0 0 2012-04-09 58 635 ... 4 7 0 0 0 0 0 0 1 67
1 2174 1954 Graduation Single 46344.0 1 1 2014-08-03 38 11 ... 2 5 0 0 0 0 0 0 0 70
2 4141 1965 Graduation Together 71613.0 0 0 2013-08-21 26 426 ... 10 4 0 0 0 0 0 0 0 59
3 6182 1984 Graduation Together 26646.0 1 0 2014-10-02 26 11 ... 4 6 0 0 0 0 0 0 0 40
4 5324 1981 PhD Married 58293.0 1 0 2014-01-19 94 173 ... 6 5 0 0 0 0 0 0 0 43
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0 1 2013-06-13 46 709 ... 4 5 0 0 0 0 0 0 0 57
2236 4001 1946 PhD Together 64014.0 2 1 2014-10-06 56 406 ... 5 7 0 0 0 1 0 0 0 78
2237 7270 1981 Graduation Divorced 56981.0 0 0 2014-01-25 91 908 ... 13 6 0 1 0 0 0 0 0 43
2238 8235 1956 Master Together 69245.0 0 1 2014-01-24 8 428 ... 10 3 0 0 0 0 0 0 0 68
2239 9405 1954 PhD Married 52869.0 1 1 2012-10-15 40 84 ... 4 7 0 0 0 0 0 0 1 70

2213 rows × 28 columns

In [37]:
age_counts=df['Age'].value_counts().sort_values(ascending=False)
plt.figure(figsize=(14,7))
sns.countplot(x='Age',data=df,order=age_counts.index)
plt.xlabel('Age')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [38]:
sns.scatterplot(x='Age' ,y= 'Recency' ,data=df)
plt.title("Scatter Plot of Age vs. Recency")
plt.show()
No description has been provided for this image
In [39]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumDealsPurchases',data=df)
plt.xlabel('NumDealsPurchases')
plt.show()
No description has been provided for this image
In [40]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumStorePurchases',data=df)
plt.xlabel('NumStorePurchases')
plt.show()
No description has been provided for this image
In [41]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumWebPurchases',data=df)
plt.xlabel('NumWebPurchases')
plt.show()
No description has been provided for this image
In [42]:
plt.figure(figsize=(14,7))
sns.countplot(x='NumCatalogPurchases',data=df)
plt.xlabel('NumCatalogPurchases')
plt.show()
No description has been provided for this image
In [43]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumStorePurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumStorePurchases')
plt.title('Marital_Status vs NumStorePurchases')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [44]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumWebPurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumWebPurchases')
plt.title('Marital_Status vs NumWebPurchases')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [45]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumCatalogPurchases',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumCatalogPurchases')
plt.title('Marital_Status vs NumCatalogPurchases')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [46]:
plt.figure(figsize=(14,7))
sns.barplot(x='Marital_Status',y='NumWebVisitsMonth',data=df, errorbar=('ci',0))
plt.xlabel('Marital_Status')
plt.ylabel('NumWebVisitsMonth')
plt.title('Marital_Status vs NumWebVisitsMonth')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [47]:
total_sum = df["NumDealsPurchases"].sum()
print(f"Total sum of NumDealsPurchases: {total_sum}")
Total sum of NumDealsPurchases: 5146
In [48]:
total_sum = df["NumWebPurchases"].sum()
print(f"Total sum of NumWebPurchases: {total_sum}")
Total sum of NumWebPurchases: 9046
In [49]:
total_sum = df["NumCatalogPurchases"].sum()
print(f"Total sum of NumCatalogPurchases: {total_sum}")
Total sum of NumCatalogPurchases: 5912
In [50]:
total_sum = df["NumStorePurchases"].sum()
print(f"Total sum of NumStorePurchases: {total_sum}")
Total sum of NumStorePurchases: 12847
In [51]:
total_sum = df["NumWebVisitsMonth"].sum()
print(f"Total sum of NumWebVisitsMonth: {total_sum}")
Total sum of NumWebVisitsMonth: 11777
In [52]:
data = df.groupby("Marital_Status")["MntMeatProducts"].sum().reset_index()
fig = px.pie(data, values='MntMeatProducts', names='Marital_Status', title='Meat Products Sum by Marital Status')
fig.show()
In [53]:
data = df.groupby("Marital_Status")["MntFruits"].sum().reset_index()
fig = px.pie(data, values='MntFruits', names='Marital_Status', title='Fruit Products Sum by Marital Status')
fig.show()
In [54]:
data = df.groupby("Marital_Status")["MntWines"].sum().reset_index()
fig = px.pie(data, values='MntWines', names='Marital_Status', title='wine Products Sum by Marital Status')
fig.show()
In [55]:
data = df.groupby("Marital_Status")["MntFishProducts"].sum().reset_index()
fig = px.pie(data, values='MntFishProducts', names='Marital_Status', title='Fish Products Sum by Marital Status')
fig.show()
In [56]:
data = df.groupby("Marital_Status")["MntSweetProducts"].sum().reset_index()
fig = px.pie(data, values='MntSweetProducts', names='Marital_Status', title='Sweet Products Sum by Marital Status')
fig.show()
In [57]:
data = df.groupby("Marital_Status")["MntGoldProds"].sum().reset_index()
fig = px.pie(data, values='MntGoldProds', names='Marital_Status', title='Gold Products Sum by Marital Status')
fig.show()
In [58]:
correlation = df[['Income','Kidhome','Teenhome','Recency','MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts',
           'MntGoldProds','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth']].corr()
plt.figure(figsize=(10,8))
sns.heatmap(correlation, cmap='coolwarm')
for i in range(correlation.shape[0]):
    for j in range(correlation.shape[1]):
        plt.text(j+0.5, i+0.5, "{:.2f}".format(correlation.iloc[i, j]), ha='center', va='center', color='white')
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image
In [59]:
fig = px.sunburst(df, path=['Education', 'Marital_Status'], values='Income', title='Sunburst Plot of Income by Education and Marital Status')
fig.show()
In [60]:
complain_counts = df['Complain'].value_counts()
plt.figure(figsize=(8,8))
complain_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Complains')
plt.ylabel('')
plt.show()
No description has been provided for this image
In [61]:
df.value_counts('Complain')
Out[61]:
Complain
0    2193
1      20
Name: count, dtype: int64
In [62]:
df.groupby("Education")["Complain"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Complaints by Education')
plt.ylabel('')
plt.show()
No description has been provided for this image
In [63]:
df.groupby("Marital_Status")["Complain"].sum().plot(kind = 'pie' ,autopct='%1.1f%%')
plt.title('Distribution of Complaints by Marital_Status')
plt.show()
No description has been provided for this image
In [64]:
df.columns
Out[64]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Response', 'Age'],
      dtype='object')
In [65]:
total_sum = df["MntWines"].sum()
print(f"Total sum of MntWines: {total_sum}")
Total sum of MntWines: 675305
In [66]:
total_sum = df["MntFruits"].sum()
print(f"Total sum of MntFruits: {total_sum}")
Total sum of MntFruits: 58255
In [67]:
total_sum = df["MntMeatProducts"].sum()
print(f"Total sum of MntMeatProducts: {total_sum}")
Total sum of MntMeatProducts: 369488
In [68]:
total_sum = df["MntFishProducts"].sum()
print(f"Total sum of MntFishProducts: {total_sum}")
Total sum of MntFishProducts: 83287
In [69]:
total_sum = df["MntSweetProducts"].sum()
print(f"Total sum of MntSweetProducts: {total_sum}")
Total sum of MntSweetProducts: 59828
In [70]:
total_sum = df["MntGoldProds"].sum()
print(f"Total sum of MntGoldProds: {total_sum}")
Total sum of MntGoldProds: 97176

Conclusion:¶

  1. The job with highest salary is Data Analytics lead.
  2. The job with second highest salary is Principal Data Engineer.
  3. The Employee who have Expert level experience gets the highest salary.
  4. The Employee with remote ratio is 100 has highest salary.
  5. The most common employment type here is full-time.
  6. The majority of people work in medium-sized companies.
  7. In 2022, Salary has increased than in 2021 and 2020.
  8. The most prevalent employee level is Senior Level.
  9. The Employee who has remote ratio 100 are the highest.
  10. Most of the workers residence is in US.
  11. Employee's who are working in a company located at RU gets the highest salary and the second is US.
  12. Most of the Employees are working in a company which are located at US.
  13. Most of the Employees are working as Data Scientist and Data Engineer.
  14. Employee's whose residence is in MY gets the highest salary.
  15. The majority of people work as full time gets the highest salary.
  16. In 2021, who had mid-level experience got highest salary.
  17. Employee who work in a large company gets the highest salary.
  18. The majority of people who has senior level experience receives highest salary.